Load the data

load("covariates.RData")
load("Disease.RData")

dat <- merge(x = full_data, y = disease[c("SEQN", "bp", "bi", "cd")], 
             by = "SEQN", all = TRUE)

dat <- dat[complete.cases(dat), ]
dat$ncd <- dat$bp == 1 | dat$bi == 1 | dat$cd == 1

dat <- dat[dat$ALQ151 != "Don't know", ]
dat <- dat[dat$DBQ700 != "Don't know", ]
dat$ALQ151 <- factor(dat$ALQ151)
dat$SLQ050 <- factor(dat$SLQ050)
dat$DBQ700 <- factor(dat$DBQ700)

dat$DBQ700 = relevel(dat$DBQ700, ref = "Excellent")
dat$ALQ151 = relevel(dat$ALQ151, ref = "Yes")
dat$SLQ050 = relevel(dat$SLQ050, ref = "Yes")

head(dat)

Data Summary

Numerically Summary

dat$ncd_p <- factor(dat$ncd, levels = c("TRUE", "FALSE", "P"), labels = c("TRUE", "FALSE", "P-value"))

rndr <- function(x, name, ...){
  if (length(x) == 0){
    y <- dat[[name]]
    s <- rep("", length(render.default(x = y, name = name)))
    if (is.numeric(y)){
      p <- t.test(y ~ dat$ncd_p)$p.value
    } else{
      p <- chisq.test(table(y, droplevels(dat$ncd_p)))$p.value
    }
    s[2] <- sub("<", "&lt;", format.pval(p, digits = 3, eps = 0.001))
    s
  } else{
    render.default(x = x, name = name, ...)
  }
}

rndr.strat <- function(label, n, ...){
  ifelse(n == 0, label, render.strat.default(label, n, ...))
}

table1(~ DBQ700 + ALQ120Q + ALQ151 + DR1TVC + DR1TCAFF + SLD012 + SLQ050 + 
         SMD030 + WHD020 + bmi| ncd_p, data = dat, droplevels = F, 
       render = rndr, render.strat = rndr.strat, overall = F)
TRUE
(n=833)
FALSE
(n=301)
P-value
DBQ700
Excellent 76 (9.1%) 38 (12.6%) 0.00578
Very good 156 (18.7%) 74 (24.6%)
Good 319 (38.3%) 119 (39.5%)
Fair 212 (25.5%) 53 (17.6%)
Poor 70 (8.4%) 17 (5.6%)
ALQ120Q
Mean (SD) 2.85 (13.4) 3.09 (4.49) 0.648
Median [Min, Max] 1.00 [0.00, 365] 2.00 [0.00, 60.0]
ALQ151
Yes 266 (31.9%) 71 (23.6%) 0.00825
No 567 (68.1%) 230 (76.4%)
DR1TVC
Mean (SD) 70.7 (80.7) 75.2 (94.6) 0.465
Median [Min, Max] 41.7 [0.00, 921] 42.8 [0.00, 745]
DR1TCAFF
Mean (SD) 207 (258) 232 (360) 0.264
Median [Min, Max] 144 [0.00, 2950] 156 [0.00, 4530]
SLD012
Mean (SD) 7.64 (1.68) 7.57 (1.38) 0.488
Median [Min, Max] 8.00 [2.00, 13.5] 7.50 [3.00, 11.5]
SLQ050
Yes 341 (40.9%) 69 (22.9%) <0.001
No 492 (59.1%) 232 (77.1%)
SMD030
Mean (SD) 18.2 (7.15) 18.8 (6.60) 0.251
Median [Min, Max] 17.0 [0.00, 58.0] 18.0 [0.00, 54.0]
WHD020
Mean (SD) 189 (43.9) 171 (35.2) <0.001
Median [Min, Max] 182 [78.0, 386] 165 [101, 300]
bmi
Mean (SD) 28.7 (7.29) 26.2 (5.99) <0.001
Median [Min, Max] 28.2 [0.000956, 64.2] 25.8 [0.00105, 53.7]

Graphical Summary

Continuous Variable: ALQ120Q, DR1TVC, DR1TCAFF, SLD012, SMD030, WHD020, bmi
(Draw Boxplots for the continuous variables)

ggboxplot(data = dat, x = "ncd", y = "ALQ120Q", add = "jitter", color = "ncd") + 
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("How often drink alcohol over past 12 mos") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat, x = "ncd", y = "DR1TVC", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Vitamin C (mg)") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat, x = "ncd", y = "DR1TCAFF", add = "jitter", color = "ncd")+
  theme(legend.position = "none") +
  xlab("Having one or more diseases") +
  ylab("Caffeine (mg)") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat, x = "ncd", y = "SLD012", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Sleep hours") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat, x = "ncd", y = "SMD030", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Age started smoking cigarettes regularly") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat, x = "ncd", y = "WHD020", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Current self-reported weight (pounds)") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat, x = "ncd", y = "bmi", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("BMI") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

Discrete variable DBQ700, ALQ151, SLQ050, smok_yn (Draw Mosaic plots for discrete variables)

pval <- function(p){
  if (p < 0.001) {
    return("< 0.001")
    } else {return(round(p, 3))}
}

ggplot(data = dat) + 
  geom_mosaic(aes(x = product(DBQ700, ncd), fill = DBQ700)) +
  xlab(label = "Having one or more diseases") +
  ylab(label = "Diet") +
  theme(legend.position = "none") +
  geom_text(x = .8, y = .8, 
            label = paste0("Chisq p ", 
                           pval(chisq.test(dat$ncd, dat$DBQ700, correct = F)$p.value)))

ggplot(data = dat) + 
  geom_mosaic(aes(x = product(ALQ151, ncd), fill = ALQ151)) +
  xlab(label = "Having one or more diseases") +
  ylab(label = "Alcohol") +
  theme(legend.position = "none") +
  geom_text(x = .8, y = .8, 
            label = paste0("Chisq p ", 
                           pval(chisq.test(dat$ncd, dat$ALQ151, correct = F)$p.value)))

ggplot(data = dat) + 
  geom_mosaic(aes(x = product(SLQ050, ncd), fill = SLQ050)) +
  xlab(label = "Having one or more diseases") +
  ylab(label = "Having trouble sleeping") +
  theme(legend.position = "none") +
  geom_text(x = .8, y = .8, 
            label = paste0("Chisq p ", 
                           pval(chisq.test(dat$ncd, dat$SLQ050, correct = F)$p.value)))

Test

Continuous Variable: ALQ120Q, DR1TVC, DR1TCAFF, SLD012, SMD030, WHD020, bmi
(Use two sample t-test with two-sided 0.05 significance level to compare the mean level of each variable between two NCDs outcomes)

t.test(dat$ncd,dat$ALQ120Q)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$ALQ120Q
## t = -6.2825, df = 1136.2, p-value = 4.736e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.861839 -1.499713
## sample estimates:
## mean of x mean of y 
## 0.7345679 2.9153439
t.test(dat$ncd,dat$DR1TVC)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$DR1TVC
## t = -28.327, df = 1133.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -76.05595 -66.20261
## sample estimates:
##  mean of x  mean of y 
##  0.7345679 71.8638448
t.test(dat$ncd,dat$DR1TCAFF)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$DR1TCAFF
## t = -24.809, df = 1133, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -229.6009 -195.9459
## sample estimates:
##   mean of x   mean of y 
##   0.7345679 213.5079365
t.test(dat$ncd,dat$SLD012)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$SLD012
## t = -139.12, df = 1303.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.984244 -6.790006
## sample estimates:
## mean of x mean of y 
## 0.7345679 7.6216931
t.test(dat$ncd,dat$SMD030)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$SMD030
## t = -84.556, df = 1142, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.04232 -17.22400
## sample estimates:
##  mean of x  mean of y 
##  0.7345679 18.3677249
t.test(dat$ncd,dat$WHD020)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$WHD020
## t = -145.1, df = 1133.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -185.6006 -180.6481
## sample estimates:
##   mean of x   mean of y 
##   0.7345679 183.8589065
t.test(dat$ncd,dat$bmi)
## 
##  Welch Two Sample t-test
## 
## data:  dat$ncd and dat$bmi
## t = -130.06, df = 1141.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -27.71861 -26.89471
## sample estimates:
##  mean of x  mean of y 
##  0.7345679 28.0412255

Discrete variable DBQ700, ALQ151, SLQ050, smok_yn (Use Chi-square tests to compare the proportions of diagnosed with NCDs for each discrete variable)

chisq.test(dat$ncd,dat$DBQ700)
## 
##  Pearson's Chi-squared test
## 
## data:  dat$ncd and dat$DBQ700
## X-squared = 14.531, df = 4, p-value = 0.00578
chisq.test(dat$ncd,dat$ALQ151)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dat$ncd and dat$ALQ151
## X-squared = 6.9775, df = 1, p-value = 0.008254
chisq.test(dat$ncd,dat$SLQ050)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dat$ncd and dat$SLQ050
## X-squared = 30.303, df = 1, p-value = 3.695e-08

Generalized Linear Regression

Univariate Regression

# Univariate logistic regression
colnames(dat) <- c("SEQN","Diet","Alcohol_freq","Alcohol",
                   "Vitamin_C","Caffeine","Sleep","Trouble_sleep",
                   "Smoking_age","smok_yn","Weight","BMI","bp","bi","cd","ncd","ncd_p")
univ = function(var)
{
  form = formula(paste0("ncd ~ ", var))
  logit = glm(form, data = dat, family = "binomial")
  
  p = coef(summary(logit))[,4]
  
  OR = data.frame(round(exp(cbind(OR = coef(logit), confint(logit))), 3),
                  p_value = format.pval(p, digits = 3, eps = 0.001))[-1,]
  colnames(OR) <- c("OR", "2.5%", "97.5%", "p_value")
  return(OR)
}

varlist = colnames(dat)[!(colnames(dat) %in% c("SEQN", "bp", "bi", "cd","ncd","smok_yn","ncd_p"))]

table_uni <- do.call(rbind.data.frame, lapply(varlist, univ))
knitr::kable(table_uni)
OR 2.5% 97.5% p_value
DietVery good 1.054 0.650 1.694 0.82898
DietGood 1.340 0.855 2.077 0.19466
DietFair 2.000 1.219 3.269 0.00578
DietPoor 2.059 1.080 4.050 0.03138
Alcohol_freq 0.998 0.987 1.013 0.761
AlcoholNo 0.658 0.483 0.888 0.00685
Vitamin_C 0.999 0.998 1.001 0.432
Caffeine 1.000 0.999 1.000 0.2
Sleep 1.027 0.946 1.115 0.5267
Trouble_sleepNo 0.429 0.315 0.578 <0.001
Smoking_age 0.990 0.972 1.008 0.269
Weight 1.012 1.008 1.015 < 0.001
BMI 1.054 1.034 1.076 <0.001

Regression

Full Model

form <- formula(paste0("ncd ~" , paste(varlist, collapse = "+")))
full <- glm(form, data = dat, family = "binomial")

final <- step(full, direction = "both", trace = 0)
#summary(final)

p <- coef(summary(final))[,4]
sum_final <- data.frame(round(exp(cbind(OR = coef(final), confint(final))), 4),
           p_value = format.pval(p, digits = 3, eps = 0.001))
## Waiting for profiling to be done...
colnames(sum_final) <- c("OR", "2.5%", "97.5%", "P-val")
rownames(sum_final)[1] <- "DietExcellent"
sum_final[1,]<-c("Ref","","","")
## Warning in `[<-.factor`(`*tmp*`, iseq, value = ""): invalid factor level, NA
## generated
knitr::kable(sum_final) 
OR 2.5% 97.5% P-val
DietExcellent Ref NA
DietVery good 1.0064 0.6111 1.6442 0.9799
DietGood 1.2011 0.7547 1.8903 0.4330
DietFair 1.7663 1.0597 2.9321 0.0281
DietPoor 1.5279 0.779 3.0809 0.2249
AlcoholNo 0.7081 0.5142 0.9671 0.0320
Caffeine 0.9995 0.999 1 0.0546
Trouble_sleepNo 0.4385 0.3193 0.5963 <0.001
Weight 1.0116 1.0079 1.0156 <0.001

effect plot

## Effect displays
plot(allEffects(final))

ROC plot

rocplot <- function(truth, pred, ...){
  predob = prediction(pred, truth)
  perf = performance(predob, "tpr", "fpr")
  plot(perf, ...) 
  area = auc(truth, pred)
  area = format(round(area, 4), nsmall = 4)
  text(x = .8, y = .1, labels = paste("AUC = ", area))
  segments(x0 = 0, y0 = 0, x1 = 1, y1 = 1, col = "gray", lty = 2)
}

rocplot(dat$ncd, final$fitted.values)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls < cases

Ordinary

Data Summary

Numerically Summary

dat2 <- dat %>%
  mutate(ncd = bp + bi + cd)
dat2$ncd <- factor(dat2$ncd)
dat2 <- dat2 %>%
  dplyr::select(-smok_yn, -Weight)
head(dat2)
dat2$ncd_p <- factor(dat2$ncd, levels = 0:4, labels = c(0:3, "P-value"))

rndr2 <- function(x, name, ...){
  if (length(x) == 0){
    y <- dat2[[name]]
    s <- rep("", length(render.default(x = y, name = name)))
    if (is.numeric(y)){
      p <- kruskal.test(y ~ dat2$ncd_p)$p.value
    } else{
      p <- chisq.test(table(y, droplevels(dat2$ncd_p)))$p.value
    }
    s[2] <- sub("<", "&lt;", format.pval(p, digits = 3, eps = 0.001))
    s
  } else{
    render.default(x = x, name = name, ...)
  }
}

rndr.strat <- function(label, n, ...){
  ifelse(n == 0, label, render.strat.default(label, n, ...))
}

table1(~ Diet+Alcohol+Vitamin_C+Caffeine+Sleep+Trouble_sleep+
         Smoking_age + BMI| ncd_p, data = dat2, droplevels = F, 
       render = rndr2, render.strat = rndr.strat, overall = F)
0
(n=301)
1
(n=418)
2
(n=347)
3
(n=68)
P-value
Diet
Excellent 38 (12.6%) 47 (11.2%) 26 (7.5%) 3 (4.4%) 0.00186
Very good 74 (24.6%) 87 (20.8%) 54 (15.6%) 15 (22.1%)
Good 119 (39.5%) 166 (39.7%) 128 (36.9%) 25 (36.8%)
Fair 53 (17.6%) 89 (21.3%) 105 (30.3%) 18 (26.5%)
Poor 17 (5.6%) 29 (6.9%) 34 (9.8%) 7 (10.3%)
Alcohol
Yes 71 (23.6%) 121 (28.9%) 116 (33.4%) 29 (42.6%) 0.0041
No 230 (76.4%) 297 (71.1%) 231 (66.6%) 39 (57.4%)
Vitamin_C
Mean (SD) 75.2 (94.6) 67.3 (83.1) 76.2 (79.6) 63.4 (69.4) 0.133
Median [Min, Max] 42.8 [0.00, 745] 39.9 [0.00, 921] 49.8 [0.00, 545] 33.0 [0.400, 331]
Caffeine
Mean (SD) 232 (360) 221 (277) 184 (224) 236 (293) 0.0911
Median [Min, Max] 156 [0.00, 4530] 160 [0.00, 2950] 132 [0.00, 1920] 153 [0.00, 1760]
Sleep
Mean (SD) 7.57 (1.38) 7.74 (1.66) 7.53 (1.68) 7.60 (1.82) 0.413
Median [Min, Max] 7.50 [3.00, 11.5] 8.00 [2.50, 13.5] 7.50 [2.00, 13.0] 8.00 [2.00, 11.0]
Trouble_sleep
Yes 69 (22.9%) 147 (35.2%) 155 (44.7%) 39 (57.4%) <0.001
No 232 (77.1%) 271 (64.8%) 192 (55.3%) 29 (42.6%)
Smoking_age
Mean (SD) 18.8 (6.60) 18.4 (6.69) 18.1 (7.70) 17.6 (7.03) 0.0115
Median [Min, Max] 18.0 [0.00, 54.0] 18.0 [0.00, 49.0] 17.0 [0.00, 58.0] 16.0 [0.00, 45.0]
BMI
Mean (SD) 26.2 (5.99) 27.8 (6.63) 29.0 (7.45) 32.7 (8.89) <0.001
Median [Min, Max] 25.8 [0.00105, 53.7] 27.5 [0.000956, 55.3] 28.3 [0.000984, 57.2] 32.4 [0.00102, 64.2]

Graphical Summary

Continuous Variable: Alcohol_freq, Vitamin_C, Caffeine, Sleep, Smoking_age, BMI (Draw Boxplots for the continuous variables)

ggboxplot(data = dat2, x = "ncd", y = "Alcohol_freq", add = "jitter", color = "ncd") + 
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("How often drink alcohol over past 12 mos") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat2, x = "ncd", y = "Vitamin_C", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Vitamin C (mg)") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat2, x = "ncd", y = "Caffeine", add = "jitter", color = "ncd")+
  theme(legend.position = "none") +
  xlab("Having one or more diseases") +
  ylab("Caffeine (mg)") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat2, x = "ncd", y = "Sleep", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Sleep hours") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

ggboxplot(data = dat2, x = "ncd", y = "Smoking_age", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("Age started smoking cigarettes regularly") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

# ggboxplot(data = dat2, x = "ncd", y = "Weight", add = "jitter", color = "ncd")+
#   theme(legend.position = "none") + 
#   xlab("Having one or more diseases") +
#   ylab("Current self-reported weight (pounds)") +
#   stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat2, x = "ncd", y = "BMI", add = "jitter", color = "ncd")+
  theme(legend.position = "none") + 
  xlab("Having one or more diseases") +
  ylab("BMI") +
  stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")

Discrete variable Diet, Alcohol, Trouble_sleep (Draw Mosaic plots for discrete variables)

pval <- function(p){
  if (p < 0.001) {
    return("< 0.001")
    } else {return(round(p, 3))}
}

ggplot(data = dat2) + 
  geom_mosaic(aes(x = product(Diet, ncd), fill = Diet)) +
  xlab(label = "Having one or more diseases") +
  ylab(label = "Diet") +
  theme(legend.position = "none") +
  geom_text(x = .8, y = .8, 
            label = paste0("Chisq p ", 
                           pval(chisq.test(dat$ncd, dat$DBQ700, correct = F)$p.value)))

ggplot(data = dat2) + 
  geom_mosaic(aes(x = product(Alcohol, ncd), fill = Alcohol)) +
  xlab(label = "Having one or more diseases") +
  ylab(label = "Alcohol") +
  theme(legend.position = "none") +
  geom_text(x = .8, y = .8, 
            label = paste0("Chisq p ", 
                           pval(chisq.test(dat$ncd, dat$ALQ151, correct = F)$p.value)))

ggplot(data = dat2) + 
  geom_mosaic(aes(x = product(Trouble_sleep, ncd), fill = Trouble_sleep)) +
  xlab(label = "Having one or more diseases") +
  ylab(label = "Having trouble sleeping") +
  theme(legend.position = "none") +
  geom_text(x = .8, y = .8, 
            label = paste0("Chisq p ", 
                           pval(chisq.test(dat$ncd, dat$SLQ050, correct = F)$p.value)))

Test

Continuous Variable: ALQ120Q, DR1TVC, DR1TCAFF, SLD012, SMD030, WHD020, bmi
(Use two sample t-test with two-sided 0.05 significance level to compare the mean level of each variable between two NCDs outcomes)

kruskal.test(dat2$ncd,dat2$Alcohol_freq)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat2$ncd and dat2$Alcohol_freq
## Kruskal-Wallis chi-squared = 38.221, df = 19, p-value = 0.005563
kruskal.test(dat2$ncd,dat2$Vitamin_C)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat2$ncd and dat2$Vitamin_C
## Kruskal-Wallis chi-squared = 787.8, df = 795, p-value = 0.5652
kruskal.test(dat2$ncd,dat2$Caffeine)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat2$ncd and dat2$Caffeine
## Kruskal-Wallis chi-squared = 411.78, df = 437, p-value = 0.8016
kruskal.test(dat2$ncd,dat2$Sleep)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat2$ncd and dat2$Sleep
## Kruskal-Wallis chi-squared = 24.654, df = 23, p-value = 0.3684
kruskal.test(dat2$ncd,dat2$Smoking_age)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat2$ncd and dat2$Smoking_age
## Kruskal-Wallis chi-squared = 71.148, df = 43, p-value = 0.004433
# kruskal.test(dat2$ncd,dat2$Weight)
kruskal.test(dat2$ncd,dat2$BMI)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat2$ncd and dat2$BMI
## Kruskal-Wallis chi-squared = 757.46, df = 713, p-value = 0.1207

Discrete variable DBQ700, ALQ151, SLQ050, smok_yn (Use Chi-square tests to compare the proportions of diagnosed with NCDs for each discrete variable)

chisq.test(dat2$ncd,dat2$Diet)
## 
##  Pearson's Chi-squared test
## 
## data:  dat2$ncd and dat2$Diet
## X-squared = 31.165, df = 12, p-value = 0.001859
chisq.test(dat2$ncd,dat2$Alcohol)
## 
##  Pearson's Chi-squared test
## 
## data:  dat2$ncd and dat2$Alcohol
## X-squared = 13.265, df = 3, p-value = 0.004098
chisq.test(dat2$ncd,dat2$Trouble_sleep)
## 
##  Pearson's Chi-squared test
## 
## data:  dat2$ncd and dat2$Trouble_sleep
## X-squared = 47.138, df = 3, p-value = 3.248e-10

For each covariate fit the univariate logistic model.

Check the proportional odds assumption

# Check the Proportional odds assumption for each variable

varlist2 <- colnames(dat2)[!(colnames(dat2) %in% c("SEQN", "bp", "bi", "cd", "ncd", "ncd_p"))]
prop_test <- data.frame()
for (var in varlist2){
  form = formula(paste0("ncd ~ ", var))
  logit = clm(form, data = dat2)
  a = scale_test(logit)$`Pr(>Chi)`[2]
  b = nominal_test(logit)$`Pr(>Chi)`[2]
  OR = data.frame(round(exp(cbind(OR = coef(logit)[4], confint(logit))), 3))
  p1 <- cbind(OR, p_val = coef(summary(logit))[-1:-3, 4] < 0.05, 
              satisfy_normality = a > 0.05,
              satisfy_effect = b > 0.05)
  prop_test <- rbind(prop_test, p1)
}
## Warning: (2) Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables? 
## In addition: Absolute and relative convergence criteria were met
prop_test

All other variables are violated the PH assumption.

Univariate

comby <- function(y, j){
  ans <- data.frame()
  for (i in 1:3){
    ans <- rbind(ans, y[,,i][j, ])
  }
  ans
}

comb <- function(x, y){
  ans <- data.frame()
  i = 3
  for (i in 2:ncol(x)){
    ans1 <- cbind(x[, i], comby(y, i))
    colnames(ans1) <- c("OR", "2.5%", "97.5%")
    rownames(ans1) <- paste0(colnames(x)[i], 1:3)
    ans <- rbind(ans, ans1)
  }
  ans
}

univ <- function(var){
  form = formula(paste0("ncd ~ ", var))
  multi = multinom(form, data = dat2)
  comb(exp(coef(multi)), exp(confint(multi)))
}

t<- round(do.call(rbind.data.frame, lapply(varlist2, univ)), 3)
## # weights:  24 (15 variable)
## initial  value 1572.057806 
## iter  10 value 1424.423662
## iter  20 value 1402.831276
## final  value 1402.828975 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1414.736437
## iter  20 value 1408.522221
## final  value 1408.519204 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1412.141794
## final  value 1412.126076 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1416.988933
## final  value 1416.987734 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1419.551744
## iter  20 value 1415.542862
## final  value 1415.527198 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1416.931480
## final  value 1416.930635 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1394.893024
## final  value 1394.748706 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1417.527116
## final  value 1417.522425 
## converged
## # weights:  12 (6 variable)
## initial  value 1572.057806 
## iter  10 value 1389.613448
## final  value 1389.501830 
## converged
knitr::kable(t)
OR 2.5% 97.5%
DietVery good1 0.951 0.561 1.612
DietVery good2 1.067 0.580 1.963
DietVery good3 2.568 0.700 9.420
DietGood1 1.128 0.692 1.838
DietGood2 1.572 0.900 2.746
DietGood3 2.661 0.761 9.307
DietFair1 1.358 0.786 2.345
DietFair2 2.896 1.592 5.267
DietFair3 4.302 1.183 15.649
DietPoor1 1.379 0.661 2.878
DietPoor2 2.923 1.358 6.292
DietPoor3 5.216 1.201 22.649
Alcohol_freq1 1.004 0.991 1.017
Alcohol_freq2 0.926 0.882 0.972
Alcohol_freq3 0.868 0.776 0.972
AlcoholNo1 0.758 0.539 1.064
AlcoholNo2 0.615 0.434 0.870
AlcoholNo3 0.415 0.240 0.719
Vitamin_C1 0.999 0.997 1.001
Vitamin_C2 1.000 0.998 1.002
Vitamin_C3 0.998 0.995 1.002
Caffeine1 1.000 0.999 1.000
Caffeine2 0.999 0.999 1.000
Caffeine3 1.000 0.999 1.001
Sleep1 1.066 0.972 1.170
Sleep2 0.985 0.895 1.084
Sleep3 1.012 0.859 1.193
Trouble_sleepNo1 0.548 0.392 0.767
Trouble_sleepNo2 0.368 0.262 0.519
Trouble_sleepNo3 0.221 0.128 0.384
Smoking_age1 0.994 0.974 1.015
Smoking_age2 0.986 0.965 1.008
Smoking_age3 0.977 0.939 1.016
BMI1 1.036 1.013 1.059
BMI2 1.065 1.039 1.091
BMI3 1.141 1.100 1.184
form2 <- formula(paste0("ncd ~" , paste(varlist2, collapse = "+")))
full2 <- clm(ncd ~ Diet + Alcohol + Vitamin_C + Caffeine + 
    Sleep + Trouble_sleep + Smoking_age + BMI, data = dat2)

final2 <- step(full2, direction = "both", trace = 0)
sum_final2 <- round(cbind(exp(coef(final2)[4:12]), exp(confint(final2))),3)
colnames(sum_final2)[1] <- "OR"
DietExcellent<-c("Ref","","")
sum_final2<-rbind(DietExcellent,sum_final2)
knitr::kable(sum_final2)
OR 2.5 % 97.5 %
DietExcellent Ref
DietVery good 1.148 0.76 1.738
DietGood 1.372 0.94 2.006
DietFair 2.005 1.338 3.01
DietPoor 1.759 1.041 2.975
AlcoholNo 0.682 0.538 0.865
Caffeine 1 0.999 1
Trouble_sleepNo 0.489 0.389 0.614
Smoking_age 0.988 0.972 1.003
BMI 1.054 1.037 1.072
# FYI, Interaction: Not significant

effect plot

## Effect displays
plot(allEffects(full2))

ROC plot

multiclass.roc(dat2$ncd, final2$fitted.values)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## 
## Call:
## multiclass.roc.default(response = dat2$ncd, predictor = final2$fitted.values)
## 
## Data: final2$fitted.values with 4 levels of dat2$ncd: 0, 1, 2, 3.
## Multi-class area under the curve: 0.8054